Model09: refactoring more for _feat_avg_pos

A. Functions

There have four different functions.

  • Data reader: Read data from file.
  • Feature functions(private): Functions which extract features are placed in here. It means that if you make a specific feature function, you can add the one into here.
  • Feature function(public): We can use only this function for feature extraction.
  • Utility functions: All the funtions except functions which are mentioned in above should be placed in here.

Data reader


In [74]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


"""
Load buzz data as a dictionary.
You can give parameter for data so that you will get what you need only.
"""
def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
          buzz_data[ii] = pickle.load(fp)
        
    return buzz_data

Feature functions(private)


In [75]:
from numpy import sign, abs


def _feat_basic(bd, group):
    X = []
    for item in bd[group].items():
        qid = item[1]['qid']
        q = bd['questions'][qid]
        item[1]['q_length'] = max(q['pos_token'].keys())
        item[1]['category'] = q['category'].lower()
        item[1]['answer'] = q['answer'].lower()
        X.append(item[1])
        
    return X
        
        
def _feat_sign_val(data):
    for item in data:
        item['sign_val'] = sign(item['position'])


def _get_avg_pos(data, sign_val=None):
    unwanted_index = []
    pos_uid = defaultdict(list)
    pos_qid = defaultdict(list)
    
    for index, key in enumerate(data):
        if sign_val and sign(data[key]['position']) != sign_val:
            unwanted_index.append(index)
        else:
            pos_uid[data[key]['uid']].append(data[key]['position'])
            pos_qid[data[key]['qid']].append(data[key]['position'])

    avg_pos_uid = {}
    avg_pos_qid = {}
    
    if not sign_val:
        sign_val = 1

    for key in pos_uid:
        pos = abs(pos_uid[key])
        avg_pos_uid[key] = sign_val * (sum(pos) / len(pos))

    for key in pos_qid:
        pos = abs(pos_qid[key])
        avg_pos_qid[key] = sign_val * (sum(pos) / len(pos))
    
    return avg_pos_uid, avg_pos_qid, unwanted_index

        
def _feat_avg_pos(data, bd, group, sign_val):
    avg_pos_uid, avg_pos_qid, unwanted_index = _get_avg_pos(bd['train'], sign_val=sign_val)
    
    if group == 'train':
        for index in sorted(unwanted_index, reverse=True):
            del data[index]
    
    for item in data:
        if item['uid'] in avg_pos_uid:
            item['avg_pos_uid'] = avg_pos_uid[item['uid']]
        else:
            vals = avg_pos_uid.values()
            item['avg_pos_uid'] = sum(vals) / float(len(vals))
              
        if item['qid'] in avg_pos_qid:
            item['avg_pos_qid'] = avg_pos_qid[item['qid']]
        else:
            vals = avg_pos_qid.values()
            item['avg_pos_qid'] = sum(vals) / float(len(vals))
        
        # Response position can be longer than length of question
        if item['avg_pos_uid'] > item['q_length']:
            item['avg_pos_uid'] = item['q_length']
        
        if item['avg_pos_qid'] > item['q_length']:
            item['avg_pos_qid'] = item['q_length']

Feature function(public)


In [76]:
def featurize(bd, group, sign_val=None, extra=None):
    # Basic features
    # qid(string), uid(string), position(float)
    # answer'(string), 'potistion'(float), 'qid'(string), 'uid'(string)
    X = _feat_basic(bd, group=group)
    
    # Some extra features
    if extra:
        for func_name in extra:
            func_name = '_feat_' + func_name
            if func_name in ['_feat_avg_pos']:
                globals()[func_name](X, bd, group=group, sign_val=sign_val)
            else:
                globals()[func_name](X)
    
    if group == 'train':
        y = []
        for item in X:
            y.append(item['position'])
            del item['position']

        return X, y
    elif group == 'test':
        return X
    else:
        raise ValueError(group, 'is not the proper type')

Utility functions


In [77]:
import csv


def select(data, keys):
    unwanted = data[0].keys() - keys
    for item in data:
        for unwanted_key in unwanted:
            del item[unwanted_key]
    return data


def write_result(test_set, predictions, file_name='guess.csv'):
    predictions = sorted([[id, predictions[index]] for index, id in enumerate(test_set.keys())])
    predictions.insert(0,["id", "position"])
    with open(file_name, "w") as fp:
        writer = csv.writer(fp, delimiter=',')
        writer.writerows(predictions)

B. Modeling

Select model


In [78]:
import multiprocessing
from sklearn import linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction import DictVectorizer
import math
from numpy import abs, sqrt


regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_qid', 'avg_pos_pid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['sign_val', 'avg_pos'])
X_train = select(X_train, regression_keys)

vec = DictVectorizer()
X_train = vec.fit_transform(X_train)

regressor_names = """
LinearRegression
Ridge
Lasso
ElasticNet
"""
print ("=== Linear Cross validation RMSE scores:")
for regressor in regressor_names.split():
    scores = cross_val_score(getattr(linear_model, regressor)(),
                             X_train, y_train,
                             cv=10,
                             scoring='mean_squared_error',
                             n_jobs=multiprocessing.cpu_count()-1
                            )
    print (regressor, sqrt(abs(scores)).mean())


=== Linear Cross validation RMSE scores:
LinearRegression 82.8796245613
Ridge 85.3747383716
Lasso 84.7536490473
ElasticNet 84.9247541352

Training and testing model


In [79]:
regression_keys = ['category', 'q_length', 'qid', 'uid', 'answer', 'avg_pos_qid', 'avg_pos_pid']
X_train, y_train = featurize(load_buzz(), group='train', sign_val=None, extra=['avg_pos'])
X_train = select(X_train, regression_keys)
X_test = featurize(load_buzz(), group='test', sign_val=None, extra=['avg_pos'])
X_test = select(X_test, regression_keys)

vec = DictVectorizer()
vec.fit(X_train + X_test)
X_train = vec.transform(X_train)
X_test = vec.transform(X_test)

In [80]:
regressor = linear_model.LassoCV()
regressor.fit(X_train, y_train)


Out[80]:
LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [81]:
print(regressor.coef_)
print(regressor.alpha_)


[  0.00000000e+00   0.00000000e+00  -0.00000000e+00 ...,   0.00000000e+00
   4.28043156e-05  -1.87619270e-02]
146.866398809

In [82]:
predictions = regressor.predict(X_test)

Writing result


In [83]:
write_result(bd['test'], predictions)

This submissions scores 84.32000 in Kaggle.